
# This file contains R script that replicates the results reported in
# Hana M. Broulíková, Peter Huber, Josef Montag, and Petr Sunega. 2018.
# "Homeownership, Mobility, and Unemployment: Evidence from Housing
# Privatization." 

# Author: Josef Montag | josef.montag@gmail.com
# ISE KBTU

# January 2, 2018 

# The code replicates only results pertaining to GSOEP data (version 32.1), 
# that is Tables 7 through 10. (For replication of the results pertaining to
# LiTS data (Tables 1 through 7, and A1 to A3), see the code in file
# "2_Replication_Code_LiTS.R.")

# Note that the GSOEP 2015 data is not part of this replication.
# The data can be obtained free of charge from the DIW Berlin, see
# https://www.diw.de/en/soep (last accessed January 2, 2018) for more
# information.

# The GSOEP data files required by this code are: 'hgen.csv', 'pgen.csv',
# 'pequiv.csv', and 'pkal.csv', 'hbrutto.csv', 'hpfad.csv.' The code should run
# after you set the working directory to the folder containing these files.

# You need to have installed the libraries required by the code in the section
# "Preliminaries" below.

# The script was tested to work in R version 3.4.2 (2017-09-28), R Core Team.
# 2017. "R: A language and environment for statistical computing." R Foundation
# for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.

# This file is sectioned using Vim wrapping delimiters: 
# "{{{1" denotes a top-level heading
# "{{{2" denotes a second-level heading

# Preliminaries {{{1

# setwd('') # Set the directory with the files

options(width = 350, digits = 3, scipen = 10, dplyr.width = Inf, 
        tibble.print_min = 20)

library(plyr)
library(tidyverse)
library(stargazer)
library(lfe)

# Load data {{{1

# Load main household data file "hgen.csv"

hgen <- read_csv('hgen.csv')

names(hgen) <- sub('^hg', '', names(hgen))

# Load individual data files 'pgen.csv', 'pequiv.csv', and 'pkal.csv'

pgen <- read_csv('pgen.csv')
names(pgen) <- sub('^pg', '', names(pgen))

pequiv <- read_csv('pequiv.csv')

pkal <- read_csv('pkal.csv')
pkal$year <- pkal$syear - 1 # Match the year of pkal and rest of data

names(pkal)[grep('1d', names(pkal))]

# Finding potential privatizers in household data {{{1

# Select useful HH data:
hgen.sel <- select(hgen, cid, hid, syear, owner, moveyr, acquis, cnstyrmin, 
                   cnstyrmax, rsubs, subsid, osubs, reduc)

# Find HHs who are potential privatizers:
privatizers <- 
  hgen.sel %>%
  arrange(hid, syear) %>%
  group_by(hid) %>%
  filter(
    (# Cond. 1: Did not move in 4-year window
     lag(moveyr, 1)  == moveyr & 
     lead(moveyr, 1) == moveyr &
     lead(moveyr, 2) == moveyr
     ) & 
    (# Cond. 2: Changed from renter (first 2 years) to owner (last 2 years)
     lag(owner, 1)  == 2 &
     owner          == 2 & 
     lead(owner, 1) == 1 &
     lead(owner, 2) == 1
     ) &
    (# Cond. 3: Did not inherit/bought new
     !lead(acquis) %in% c(-2, 2, 3) & !lead(acquis, 2) %in% c(-2, 2, 3)
     )
  ) %>%
  filter(!gdata::duplicated2(hid)) %>%
  mutate(privyr = syear + 1)

# Attach potential privatizers to HH data and define related vars:
hh <- hgen.sel %>%
  left_join(select(privatizers, hid, privyr)) %>% 
  mutate(privatizer = hid %in% privatizers$hid,
         privatized = privatizer & syear >= privyr)

# Attach relevant HH vars from other data files:
hh <- hh %>%
  left_join(select(read_csv('hbrutto.csv'), hid, syear, htyp, hader, datumy)
  ) %>%
  left_join(select(read_csv('hpfad.csv'), hid, hsample)
  ) %>%
    group_by(hid) %>%
    mutate(syear.max = max(syear), 
           syear.min = min(syear)) %>%
    ungroup() %>%
    mutate(cnstyrmin = case_when(cnstyrmin <= -1 ~ NA_integer_, 
                                 cnstyrmin ==  0 ~ as.integer(1900),
                                 TRUE ~ cnstyrmin),
           cnstyrmax = case_when(cnstyrmax <= -1 ~ NA_integer_,
                                 TRUE ~ cnstyrmax),
           priv.trnd = ifelse(privatized, syear - privyr, 0)
           )

# Create the individual-level analysis dataset {{{1

p <-
  pgen %>%
  left_join(hh) %>%
  mutate(unemp = stib == 12,
         unemp1 = lfs == 6,
         lf = lfs %in% c(6, 11:12),
         selfemp = stib %in% 410:440,
         howner = owner == 1,
         renter = owner == 2
         ) %>%
  group_by(hid) %>%
  mutate(switcher = min(howner) != max(howner) & !privatizer) %>%
  ungroup() %>%
  select(hid, pid, 
         hsample,
         dplyr::contains('syear'), moveyr,
         jobch, 
         jobten = erwzeit,
         lf, 
         selfemp,
         autono,
         dplyr::contains('unemp'), 
         dplyr::contains('priv'), 
         howner, renter, switcher,
         nation, month) %>%
  left_join(pequiv %>%
            select(cid, hid, pid, syear,
                   age      = d11101, 
                   female   = d11102ll, 
                   married  = d11104, 
                   reltoh   = d11105, 
                   hhsize   = d11106, 
                   nchild   = d11107, 
                   hschool  = d11108, 
                   educyrs  = d11109,
                   bundesl  = l11101,
                   eastger  = l11102,
                   wrkhrs   = e11101,
                   comm.exp = itray,
                   hweight  = w11102)
            ) %>%
  mutate_each(funs(replace(., . == -1, NA))) %>%
  left_join(pkal %>% 
            select(cid, hid, pid, syear = year, unemp.months = kal1d02)) %>%
  mutate(female  = female == 2,
         married = married == 1,
         reltoh  = case_when(reltoh == 1 ~ 'Head',
                             reltoh == 2 ~ 'Partner',
                             reltoh %in% 3:5 ~ ''
                             ),
         hschool = case_when(hschool == 1 ~ 'Less than HS', 
                             hschool == 2 ~ 'HS', 
                             hschool == 3 ~ 'More than HS' 
                             ),
         syearXage = paste0(syear, '-', age),
         Region = case_when(eastger == 1 ~ 'West Germany',
                            eastger == 2 ~ 'East Germany'
                            ),
         eastger = eastger == 2,
         unemp.months = case_when(unemp.months %in% c(-1, -3) ~ as.integer(NA),
                                  unemp.months == -2 ~ as.integer(0),
                                  T ~ unemp.months),
         jobch = jobch == 3,
         jobten = case_when(jobten %in% c(-2, -3) ~ as.double(NA),
                            T ~ jobten),
         autono = ifelse(autono < 0, NA, autono)
         ) %>%
  group_by(pid) %>%
  mutate(moved = moveyr != lag(moveyr) & howner == lag(howner),
         moved.bundesl = bundesl != lag(bundesl) & howner == lag(howner),
         moved.region = Region != lag(Region) & howner == lag(howner),
         jobten = jobten - lag(jobten)
         ) %>%
  ungroup()

# Subset data
p <- filter(p, eastger & !is.na(privatizer))

# TABLE 7: Summary statistics {{{1

options(digits = 5)

# Compute summary statistics:
ss <- p %>%
  select(privatizer, moved, moved.bundesl, moved.region, unemp, unemp.months,
         lf, wrkhrs, selfemp, howner, renter,
         privatized, privyr, age, female, nchild, educyrs) %>%
  group_by(privatizer) %>%
  summarize_all(funs(mean(., na.rm = TRUE), sd(., na.rm = TRUE)))

# Compute number of observations:
N <- p %>%
  group_by(privatizer) %>%
  summarize(n())

# Compute number of individuals:
Np <- p %>%
  group_by(pid) %>%
  slice(1) %>%
  group_by(privatizer) %>%
  summarize(n())

ss.means <- t(ss[c(1, grep('mean', names(ss)))])
ss.sds <- t(ss[c(1, grep('sd', names(ss)))])

ss <- cbind(ss.means[, 1], ss.sds[, 1], ss.means[, 2], ss.sds[, 2])[-1, ]
ss <- round(ss, 3)
ss <- rbind(ss, N = c(N[1, 2], '', N[2, 2], ''))
ss <- rbind(ss, Np = c(Np[1, 2], '', Np[2, 2], ''))
ss['privyr_mean', ][c(1, 2)] <- '-'
colnames(ss) <- rep(c('Mean', 'Std. Dev.'), 2)
rownames(ss) <- sub('_mean$', '', rownames(ss))

var.labels <- c(moved         = 'Moved residence (=1)',
                moved.bundesl = 'Moved to another federal state (=1)',
                moved.region  = 'Moved between East and West (=1)',
                unemp         = 'Unemployed (=1)',
                unemp.months  = 'Months unemployed',
                lf            = 'Active in labor force (=1)',
                wrkhrs        = 'Yearly hours worked',
                selfemp       = 'Self-employed (=1)',
                howner        = 'Homeowner (=1)',
                renter        = 'Renter (=1)',
                privatized    = 'Potential privatizer (=1)',
                privyr        = 'Year of quasi privatization',
                age           = 'Age',
                female        = 'Female (=1)',
                nchild        = 'Number of children',
                educyrs       = 'Years of education',
                N             = 'SUB',
                Np            = 'Number of individuals')
rownames(ss) <- var.labels[rownames(ss)]

# Print TABLE 7
ss

# TABLEs 8, 9, and 10 {{{1

# Regressions {{{2

# Define outcome varialbes:
outcomes <- list(models1 = c('moved', 'moved.bundesl', 'moved.region'),
                 models2 = c('unemp', 'unemp.months'),
                 models3 = c('lf', 'wrkhrs', 'selfemp')) 

# Set the loop and estimate the models
regs <- sapply(outcomes, USE.NAMES = TRUE, simplify = FALSE, function(outs) {
          sapply(outs, USE.NAMES = TRUE, simplify = FALSE, function(out) {
                 outcome <- parse(text = out)
                 lm1 <- felm(eval(outcome) ~ howner | syear + age |
                             0 | pid, p)
                 lm2 <- felm(eval(outcome) ~ howner | pid + syear + age |
                             0 | pid, p)         
                 lm3 <- felm(eval(outcome) ~ privatized | 
                                             syear + age | 
                             0 | pid, p)
                 lm4 <- felm(eval(outcome) ~ privatized | 
                                             pid + syear + age |
                             0 | pid, p) 
                 list(lm1, lm2, lm3, lm4)
              })
              })
                                                    
# Print TABLEs 8, 9, and 10 {{{2

lapply(regs, function(models) {
  r <- length(names(models))
  stargazer(models,
            type = 'text',
            keep.stat = c('adj.rsq', 'n'),
            star.char = c('+', '*'),
            star.cutoffs = c(.05, .01),
            column.labels = rep(names(models), each = r),
            add.lines = list(c('Individual fixed effects', 
                               paste0('\\multicolumn{1}{c}{',
                                      rep(c('--', 'Yes',  '--', 'Yes'), r),
                                      '}'
                                      )
                               ),
                             c('Year effects $\\times$ Age effects', 
                               paste0('\\multicolumn{1}{c}{',
                                      rep('Yes', 4 * r),
                                      '}'
                                      )
                               )),
            digits = 3,
            no.space = TRUE,
            align = TRUE
          ) 
              }) -> dump

